library(ggplot2)
library(plotly)
library(GGally)
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
# Load the CPS data from the working directory and preview the first rows.
df <- read.csv(file = "cps.csv")
head(df)
# Report how many rows (instances) the data frame holds.
cat("Number of instances : ", nrow(df))
Number of instances :  534
# Report how many columns (attributes) the data frame holds.
cat("\nNumber of attributes : ", ncol(df))

Number of attributes :  11
# Compact overview of each column's type and first values.
utils::str(df)
'data.frame':   534 obs. of  11 variables:
 $ wage    : num  9 5.5 3.8 10.5 15 9 9.57 15 11 5 ...
 $ educ    : int  10 12 12 12 12 16 12 14 8 12 ...
 $ race    : chr  "W" "W" "W" "W" ...
 $ sex     : chr  "M" "M" "F" "F" ...
 $ hispanic: chr  "NH" "NH" "NH" "NH" ...
 $ south   : chr  "NS" "NS" "NS" "NS" ...
 $ married : chr  "Married" "Married" "Single" "Married" ...
 $ exper   : int  27 20 4 29 40 27 5 22 42 14 ...
 $ union   : chr  "Not" "Not" "Not" "Not" ...
 $ age     : int  43 38 22 47 58 49 23 42 56 32 ...
 $ sector  : chr  "const" "sales" "sales" "clerical" ...

Find the distribution of wage

## Kernel density plot of wage
# The title previously referred to "brain weight" (and misspelled "Kernel");
# the plotted variable is `wage`, so the title now says so.
density_plot <- ggplot(df, aes(x = wage)) +
  geom_density(fill = "indianred3") +
  labs(x = "wage", y = "density", title = "Kernel density of wage")

density_plot

Find the distribution of wage with respect to race

## Box plot of wage by race
# A raw bar trace draws one bar per observation on top of each other, which
# hides the distribution. A box plot per race shows the median, quartiles and
# outliers of wage, which is what the question asks for.
race_box_plot <- plot_ly(data = df, x = ~race, y = ~wage, type = "box")
race_box_plot

Is there a correlation between age and wage?

# Pearson correlation coefficient between wage and age.
x <- df$wage # numeric
y <- df$age # integer
cat(cor(x, y, method = "pearson"))
0.1769669

Does the wage differ with marital status?

## Bar plot of wage by marital status, coloured by sex
bar_plot <- plot_ly(
  data = df, x = ~married, y = ~wage, color = ~sex, type = "bar"
)
# Axis titles are set through `title`; the previous `text` entries are not
# axis attributes, so the labels were never applied. The plot title and axis
# labels now describe the variables actually plotted (marital status, wage).
bar_plot %>% layout(
  title = list(text = "Wage by marital status"),
  legend = list(title = list(text = "Gender")),
  xaxis = list(title = "Marital status"),
  yaxis = list(title = "Wage")
)
Warning in RColorBrewer::brewer.pal(N, "Set2") :
  minimal value for n is 3, returning requested palette with 3 different levels

Warning in RColorBrewer::brewer.pal(N, "Set2") :
  minimal value for n is 3, returning requested palette with 3 different levels

Warning in RColorBrewer::brewer.pal(N, "Set2") :
  minimal value for n is 3, returning requested palette with 3 different levels

Warning in RColorBrewer::brewer.pal(N, "Set2") :
  minimal value for n is 3, returning requested palette with 3 different levels
  1. Using Iris Data-Set
  1. Plot the relation between sepal length and sepal width. Differentiate the different species and also show the variation in the sepal length in the graph
# Scatter plot of sepal length against sepal width: point shape encodes the
# species and point colour encodes the sepal length.
df2 <- iris
plot <- ggplot(
  data = df2,
  mapping = aes(
    x = Sepal.Length,
    y = Sepal.Width,
    color = Sepal.Length,
    shape = Species
  )
) +
  geom_point() +
  labs(
    title = "Sepal Length and Sepal Width",
    x = "Sepal Length",
    y = "Sepal Width"
  )
plot

  1. Use subplots and plot the relationship between the different species and the other attributes
# Pairwise plot matrix of the four numeric iris columns, coloured by species;
# the upper triangle is left blank.
ggpairs(
  df2,
  mapping = aes(color = Species),
  columns = 1:4,
  upper = "blank"
) +
  ggtitle("IRIS")

 plot: [1,1] [===>--------------------------------------------------------------------]  6% est: 0s 
 plot: [1,2] [========>---------------------------------------------------------------] 12% est: 0s 
 plot: [1,3] [=============>----------------------------------------------------------] 19% est: 0s 
 plot: [1,4] [=================>------------------------------------------------------] 25% est: 0s 
 plot: [2,1] [=====================>--------------------------------------------------] 31% est: 0s 
 plot: [2,2] [==========================>---------------------------------------------] 38% est: 0s 
 plot: [2,3] [===============================>----------------------------------------] 44% est: 0s 
 plot: [2,4] [===================================>------------------------------------] 50% est: 0s 
 plot: [3,1] [=======================================>--------------------------------] 56% est: 0s 
 plot: [3,2] [============================================>---------------------------] 62% est: 0s 
 plot: [3,3] [=================================================>----------------------] 69% est: 0s 
 plot: [3,4] [=====================================================>------------------] 75% est: 0s 
 plot: [4,1] [=========================================================>--------------] 81% est: 0s 
 plot: [4,2] [==============================================================>---------] 88% est: 0s 
 plot: [4,3] [===================================================================>----] 94% est: 0s 
 plot: [4,4] [========================================================================]100% est: 0s 
                                                                                                    

  1. Using the iris data-set
  1. Create a new column for Sepal Length to Petal Length ratio
df4 <- iris

# mutate() returns a new data frame rather than modifying its input, so the
# result is assigned back; otherwise head(df4) would not show `ratio`.
df4 <- df4 %>% mutate(ratio = Sepal.Length / Petal.Length)
head(df4)
  1. Select all the columns except Species column
# Keep every column except Species.
select(df4, -Species)
  1. Select only the rows where Sepal width is greater than 3.5
# Keep only the rows whose sepal width exceeds 3.5.
filter(df4, Sepal.Width > 3.5)
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CmxpYnJhcnkoZ2dwbG90MikKbGlicmFyeShwbG90bHkpCmxpYnJhcnkoR0dhbGx5KQpgYGAKCmBgYHtyfQpkZiA9IHJlYWQuY3N2KCdjcHMuY3N2JykKaGVhZChkZikKYGBgCgpgYGB7cn0KY2F0KCJOdW1iZXIgb2YgaW5zdGFuY2VzIDogIixucm93KGRmKSkKY2F0KCJcbk51bWJlciBvZiBhdHRyaWJ1dGVzIDogIixuY29sKGRmKSkKYGBgCgpgYGB7cn0Kc3RyKGRmKQpgYGAKCkZpbmQgdGhlIGRpc3RyaWJ1dGlvbiBvZiB3YWdlCmBgYHtyfQojIyBLZXJuZWwgRGVuc2l0eSBQbG90CmRlbnNpdHlfcGxvdCA9IGdncGxvdChkZiwgYWVzKHdhZ2UpKSArIGdlb21fZGVuc2l0eShmaWxsPSdpbmRpYW5yZWQzJykgKyAKICAgICAgICAgICAgICBsYWJzKHggPSAnd2FnZScsIHkgPSAnZGVuc2l0eScsIHRpdGxlID0gJ0tlcm5hbCBkZW5zaXR5IG9mIHRoZSBicmFpbiB3ZWlnaHQnKQoKZGVuc2l0eV9wbG90CmBgYAoKRmluZCB0aGUgZGlzdHJpYnV0aW9uIG9mIHdhZ2Ugd2l0aCByZXNwZWN0IHRvIHJhY2UgCmBgYHtyfQojIyBCYXItcGxvdApiYXJfcGxvdCA9IHBsb3RfbHkoZGF0YT1kZiwgeD1+cmFjZSwgeT1+d2FnZSwgdHlwZT0nYmFyJykKYmFyX3Bsb3QKYGBgCgpJcyB0aGVyZSBhIGNvcnJlbGF0aW9uIGJldHdlZW4gYWdlIGFuZCB3YWdlID8/CmBgYHtyfQp4ID0gZGYkd2FnZSAjIyBudW1lcmljCnkgPSBkZiRhZ2UgICMjIGludGVnZXIKY2F0KGNvcih4LCB5LCBtZXRob2QgPSBjKCJwZWFyc29uIikpKQpgYGAKCkRvZXMgdGhlIHdhZ2UgZGlmZmVyIHdpdGggbWFyaXRhbCBzdGF0dXMgPz8KYGBge3J9CiMjIEJhci1wbG90CmJhcl9wbG90ID0gcGxvdF9seShkYXRhPWRmLCB4PX5tYXJyaWVkLCB5PX53YWdlLCBjb2xvcj1+c2V4LCB0eXBlPSdiYXInKQpiYXJfcGxvdCAlPiUgbGF5b3V0KAogICAgICAgICAgICAgICAgdGl0bGUgPSBsaXN0KHRleHQgPSAnQWdlIHZzIFdhZ2UnKSwKICAgICAgICAgICAgICAgIGxlZ2VuZCA9IGxpc3QodGl0bGUgPSAnR2VuZGVyJyksCiAgICAgICAgICAgICAgICB4YXhpcyA9IGxpc3QodGV4dCA9ICdBZ2UnKSwKICAgICAgICAgICAgICAgIHlheGlzID0gbGlzdCh0ZXh0ID0gJ1dhZ2UnKQogICAgICAgICAgKQpgYGAKCjIpIFVzaW5nIElyaXMgRGF0YS1TZXQKYSkgUGxvdCB0aGUgcmVsYXRpb24gYmV0d2VlbiBzZXBhbCBsZW5ndGggYW5kIHNlcGFsIHdpZHRoLiAKRGlmZmVyZW50aWF0ZSB0aGUgZGlmZmVyZW50IHNwZWNpZXMgYW5kIGFsc28gc2hvdyB0aGUgdmFyaWF0aW9uIGluIHRoZSBzZXBhbCBsZW5ndGggaW4gdGhlIGdyYXBoCmBgYHtyfQpkZjIgPSBpcmlzCnBsb3QgPSBnZ3Bsb3QoZGF0YSA9IGRmMiwgYWVzKHg9U2VwYWwuTGVuZ3RoLCB5PVNlcGFsLldpZHRoLCBjb2xvcj1TZXBhbC5MZW5ndGgsIHNoYXBlPVNwZWNpZXMpKSArIGdlb21fcG9pbnQoKSArCiAgICAgIGxhYnModGl0bGUgPSAnU2Vw
YWwgTGVuZ3RoIGFuZCBTZXBhbCBXaWR0aCcsCiAgICAgICAgICAgeCA9ICdTZXBhbCBMZW5ndGgnLAogICAgICAgICAgIHkgPSAnU2VwYWwgV2lkdGgnKQpwbG90CmBgYAoKYikgVXNlIHN1YnBsb3RzIGFuZCBwbG90IHRoZSByZWxhdGlvbnNoaXAgYmV0d2VlbiB0aGUgZGlmZmVyZW50IHNwZWNpZXMgYW5kIHRoZSBvdGhlciBhdHRyaWJ1dGVzCmBgYHtyfQpnZ3BhaXJzKGRmMiwgY29sdW1ucz0xOjQsIHVwcGVyID0gJ2JsYW5rJywgYWVzKGNvbG9yPVNwZWNpZXMpKSArIGdndGl0bGUoJ0lSSVMnKQpgYGAKCjQpIFVzaW5nIHRoZSBpcmlzIGRhdGEtc2V0CmEpIENyZWF0ZSBhIG5ldyBjb2x1bW4gZm9yIFNlcGFsIExlbmd0aCB0byBQZXRhbCBMZW5ndGggcmF0aW8KYGBge3J9CmRmNCA9IGlyaXMKCmRmNCAlPiUgbXV0YXRlKHJhdGlvID0gU2VwYWwuTGVuZ3RoIC8gUGV0YWwuTGVuZ3RoKQpoZWFkKGRmNCkKYGBgCmIpIFNlbGVjdCBhbGwgdGhlIGNvbHVtbnMgZXhjZXB0IFNwZWNpZXMgY29sdW1uCmBgYHtyfQpkZjQgJT4lIHNlbGVjdCgtU3BlY2llcykKYGBgCgpjKSBTZWxlY3Qgb25seSB0aGUgcm93cyB3aGVyZSBTZXBhbCB3aWR0aCBpcyBncmVhdGVyIHRoYW4gCmBgYHtyfQpkZjQgJT4lIGZpbHRlcihTZXBhbC5XaWR0aCA+IDMuNSkKYGBgCgo=